Combine data files with state populations

The first data file covers 2000-2010; the second covers 2010-2016.


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from os.path import join
cwd = os.getcwd()
data_directory = join(cwd, '..', 'Data storage')

Import 2000-2010 data

The SEX, ORIGIN, RACE, and AGEGRP columns mean "All" (i.e., totals) when their value is 0; a quick sanity check of this interpretation follows the data preview below.

It's not clear from the column names whether these are beginning- or end-of-year values (the Census file layout describes the POPESTIMATE columns as July 1 estimates).

https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/


In [3]:
path = os.path.join(data_directory, 'Population data',
                    'st-est00int-alldata.csv')
pop1 = pd.read_csv(path)

In [4]:
pop1.head()


Out[4]:
REGION DIVISION STATE NAME SEX ORIGIN RACE AGEGRP ESTIMATESBASE2000 POPESTIMATE2000 ... POPESTIMATE2002 POPESTIMATE2003 POPESTIMATE2004 POPESTIMATE2005 POPESTIMATE2006 POPESTIMATE2007 POPESTIMATE2008 POPESTIMATE2009 CENSUS2010POP POPESTIMATE2010
0 0 0 0 United States 0 0 0 0 281424600 282162411 ... 287625193 290107933 292805298 295516599 298379912 301231207 304093966 306771529 308745538 309349689
1 0 0 0 United States 0 0 0 1 19176154 19178293 ... 19429192 19592446 19785885 19917400 19938883 20125962 20271127 20244518 20201362 20200529
2 0 0 0 United States 0 0 0 2 20549855 20463852 ... 19872417 19620851 19454237 19389067 19544688 19714611 19929602 20182499 20348657 20382409
3 0 0 0 United States 0 0 0 3 20528425 20637696 ... 21261421 21415353 21411680 21212579 21033138 20841042 20706655 20660564 20677194 20694011
4 0 0 0 United States 0 0 0 4 20218782 20294955 ... 20610370 20797166 21102552 21486214 21807709 22067816 22210880 22192810 22040343 21959087

5 rows × 21 columns
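
A quick sanity check of the "0 means All" interpretation, as a rough sketch: with SEX, ORIGIN, and RACE held at 0, the individual age-group rows should sum to the AGEGRP == 0 row (this assumes the age codes partition the total, as the Census file layout describes).

# Sketch: the AGEGRP == 0 row should equal the sum of the other age groups
# for the United States, with SEX, ORIGIN, RACE at their 0 ("All") codes.
us = pop1.loc[(pop1['SEX'] == 0) & (pop1['ORIGIN'] == 0) &
              (pop1['RACE'] == 0) & (pop1['NAME'] == 'United States')]
print(us.loc[us['AGEGRP'] == 0, 'POPESTIMATE2000'].iloc[0])
print(us.loc[us['AGEGRP'] > 0, 'POPESTIMATE2000'].sum())  # expect a match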


In [5]:
pop1 = pop1.loc[(pop1['SEX'] == 0) &
                (pop1['ORIGIN'] == 0) &
                (pop1['RACE'] == 0) &
                (pop1['AGEGRP'] == 0), :]

In [6]:
# Column names for population estimate
est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)]

In [7]:
# Melt the wide-form data into a tidy dataframe
pop1_tidy = pd.melt(pop1, id_vars='NAME',
                    value_vars=est_cols, var_name='Year',
                    value_name='Population')

In [8]:
pop1_tidy.head()


Out[8]:
NAME Year Population
0 United States POPESTIMATE2000 282162411
1 Alabama POPESTIMATE2000 4452173
2 Alaska POPESTIMATE2000 627963
3 Arizona POPESTIMATE2000 5160586
4 Arkansas POPESTIMATE2000 2678588

In [9]:
def map_year(x):
    'Return last 4 characters (the year)'
    year = x[-4:]
    return int(year)

pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year)

The 2010 values shown below are ever so slightly different from those listed in the later dataset.


In [10]:
pop1_tidy.loc[pop1_tidy['Year'] == 2010].head()


Out[10]:
NAME Year Population
520 United States 2010 309349689
521 Alabama 2010 4785298
522 Alaska 2010 713985
523 Arizona 2010 6413737
524 Arkansas 2010 2921606

In [11]:
pop1_tidy.head()


Out[11]:
NAME Year Population
0 United States 2000 282162411
1 Alabama 2000 4452173
2 Alaska 2000 627963
3 Arizona 2000 5160586
4 Arkansas 2000 2678588

In [12]:
pop1_tidy.tail()


Out[12]:
NAME Year Population
567 Virginia 2010 8024617
568 Washington 2010 6744496
569 West Virginia 2010 1853973
570 Wisconsin 2010 5691047
571 Wyoming 2010 564460

In [13]:
pop1_tidy.columns = ['State', 'Year', 'Population']

Import 2010-2016 data


In [14]:
path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx')
# Note: parse_cols and skip_footer were renamed to usecols and skipfooter
# in later pandas releases.
pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7)

In [15]:
pop2.head()


Out[15]:
2010 2011 2012 2013 2014 2015 2016
United States 309348193 311663358 313998379 316204908 318563456 320896618 323127513
Northeast 55388056 55632766 55829059 55988771 56116791 56184737 56209510
Midwest 66978602 67153331 67332320 67543948 67726368 67838387 67941429
South 114863114 116061801 117299171 118424320 119696311 121039206 122319574
West 72118421 72815460 73537829 74247869 75023986 75834288 76657000

In [16]:
pop2.tail()


Out[16]:
2010 2011 2012 2013 2014 2015 2016
.Virginia 8025773 8110035 8192048 8262692 8317372 8367587 8411808
.Washington 6743226 6822520 6895226 6968006 7054196 7160290 7288000
.West Virginia 1854230 1854972 1856560 1853231 1848514 1841053 1831102
.Wisconsin 5690263 5709640 5726177 5742854 5758377 5767891 5778708
.Wyoming 564513 567725 576765 582684 583642 586555 585501

In [17]:
drop_rows = ['Northeast', 'Midwest', 'South', 'West']
pop2.drop(drop_rows, inplace=True)

In [19]:
pop2.index = pop2.index.str.strip('.')

In [20]:
pop2.head()


Out[20]:
2010 2011 2012 2013 2014 2015 2016
United States 309348193 311663358 313998379 316204908 318563456 320896618 323127513
Alabama 4785492 4799918 4815960 4829479 4843214 4853875 4863300
Alaska 714031 722713 731089 736879 736705 737709 741894
Arizona 6408312 6467163 6549634 6624617 6719993 6817565 6931071
Arkansas 2921995 2939493 2950685 2958663 2966912 2977853 2988248

In [21]:
pop2.columns


Out[21]:
Int64Index([2010, 2011, 2012, 2013, 2014, 2015, 2016], dtype='int64')

In [22]:
pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index',
                    value_vars=range(2010, 2017), value_name='Population',
                    var_name='Year')
pop2_tidy.columns = ['State', 'Year', 'Population']

Combine data


In [23]:
pop_total = pd.concat([pop1_tidy, pop2_tidy])

The overlapping 2010 values differ, but only barely. I'm going to re-combine the datasets, keeping the 2010 values from the second dataset.
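
A rough sketch quantifying "just barely", using the tidy frames built above:

# Compare the overlapping 2010 estimates from the two sources by state.
p1 = pop1_tidy.loc[pop1_tidy['Year'] == 2010].set_index('State')['Population']
p2 = pop2_tidy.loc[pop2_tidy['Year'] == 2010].set_index('State')['Population']
# Largest absolute percentage difference across states (and the US total).
print(((p2 - p1) / p1 * 100).abs().max())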


In [24]:
pop_total.loc[pop_total['Year']==2010].sort_values('State')


Out[24]:
State Year Population
1 Alabama 2010 4785492
521 Alabama 2010 4785298
522 Alaska 2010 713985
2 Alaska 2010 714031
523 Arizona 2010 6413737
3 Arizona 2010 6408312
524 Arkansas 2010 2921606
4 Arkansas 2010 2921995
525 California 2010 37349363
5 California 2010 37332685
526 Colorado 2010 5049071
6 Colorado 2010 5048644
527 Connecticut 2010 3577073
7 Connecticut 2010 3579899
528 Delaware 2010 899769
8 Delaware 2010 899816
529 District of Columbia 2010 604453
9 District of Columbia 2010 605183
530 Florida 2010 18843326
10 Florida 2010 18849098
11 Georgia 2010 9713521
531 Georgia 2010 9712587
532 Hawaii 2010 1363621
12 Hawaii 2010 1363945
13 Idaho 2010 1571010
533 Idaho 2010 1571450
14 Illinois 2010 12841578
534 Illinois 2010 12843166
15 Indiana 2010 6490528
535 Indiana 2010 6490621
... ... ... ...
558 Oregon 2010 3838957
38 Oregon 2010 3838048
39 Pennsylvania 2010 12712343
559 Pennsylvania 2010 12709630
40 Rhode Island 2010 1053337
560 Rhode Island 2010 1052886
41 South Carolina 2010 4635943
561 South Carolina 2010 4636312
42 South Dakota 2010 816325
562 South Dakota 2010 816463
43 Tennessee 2010 6356671
563 Tennessee 2010 6356897
564 Texas 2010 25257114
44 Texas 2010 25244310
0 United States 2010 309348193
520 United States 2010 309349689
45 Utah 2010 2775326
565 Utah 2010 2776469
566 Vermont 2010 625960
46 Vermont 2010 625982
47 Virginia 2010 8025773
567 Virginia 2010 8024617
48 Washington 2010 6743226
568 Washington 2010 6744496
49 West Virginia 2010 1854230
569 West Virginia 2010 1853973
50 Wisconsin 2010 5690263
570 Wisconsin 2010 5691047
571 Wyoming 2010 564460
51 Wyoming 2010 564513

104 rows × 3 columns


In [25]:
pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy])

In [26]:
pop_total.head()


Out[26]:
State Year Population
0 United States 2000 282162411
1 Alabama 2000 4452173
2 Alaska 2000 627963
3 Arizona 2000 5160586
4 Arkansas 2000 2678588

In [27]:
pop_total.tail()


Out[27]:
State Year Population
359 Virginia 2016 8411808
360 Washington 2016 7288000
361 West Virginia 2016 1831102
362 Wisconsin 2016 5778708
363 Wyoming 2016 585501
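
Before saving, a quick check (a sketch) that the 2010 overlap was actually resolved; each state/year combination should now appear exactly once.

# Count duplicated State/Year pairs; expect 0 after keeping only the
# pre-2010 rows from the first dataset.
print(pop_total.duplicated(subset=['State', 'Year']).sum())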

In [81]:
# Save to the same 'Data storage' directory used for the reads above.
path = os.path.join(data_directory, 'Derived data', 'State population.csv')
pop_total.to_csv(path, index=False)
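
A minimal usage sketch of the combined data, using the matplotlib import from the top of the notebook (the state is chosen arbitrarily for illustration):

# Plot one state's population across the full 2000-2016 range.
texas = pop_total.loc[pop_total['State'] == 'Texas'].sort_values('Year')
plt.plot(texas['Year'], texas['Population'])
plt.xlabel('Year')
plt.ylabel('Population')
plt.title('Texas population estimates, 2000-2016')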